# Load the raw data and split it into haematology measures and
# environmental descriptors.
df <- read_xlsx("Sleepy lizard.xlsx")

# Six white-blood-cell variables used throughout the analysis.
hem <- df %>%
  dplyr::select("Tot_WBC", "Het_ABS", "Lym_ABS",
                "H:L Ratio", "Mon_ABS", "OthG_ABS")

# Environmental descriptors (columns 4:7; assumed to hold Treatment,
# Habitat, Landscape ID, ... -- confirm against the spreadsheet).
env <- df[, 4:7]

# Recode integer codes into readable labels. match() returns NA for
# any unexpected code instead of silently mislabelling it.
env$Treatment <- c("normal", "modified")[match(env$Treatment, 1:2)]
env$Habitat <- c("normal", "swale plantings", "fields with crops",
                 "under fallow")[match(env$Habitat, 1:4)]

# Label landscapes by order of first appearance. paste0() generalizes
# the original hard-coded c("LS1", "LS2", "LS3") to any number of
# landscapes while producing identical labels for the first three.
env$`Landscape ID` <-
  paste0("LS", match(env$`Landscape ID`, unique(env$`Landscape ID`)))
# Raw distributions of the six haematology variables before any
# transformation. Outlier points of the boxplot are suppressed
# (FALSE, never the reassignable F shorthand) because the jitter
# layer already draws every observation.
hem %>%
  as.data.frame() %>%
  pivot_longer(cols = 1:6) %>%
  ggplot(aes(name, value)) +
  geom_boxplot(outliers = FALSE) +
  geom_jitter(size = .1) +
  theme_minimal()
OK, standardize data and center it
# Centre a numeric vector on its mean. Set `scale = TRUE` to also
# divide by the standard deviation (the z-score variant that was left
# commented out in the original code).
#
# x:     numeric vector; NAs would propagate through mean()/sd()
# scale: logical; divide by sd(x) after centring. Default FALSE keeps
#        the original centre-only behaviour.
center <- function(x, scale = FALSE) {
  out <- x - mean(x)
  if (scale) {
    out <- out / sd(x)
  }
  out
}

# log10(x + 1): a log transform that stays defined for zero counts.
log10_1 <- function(x) log10(x + 1)
# Transform and double-centre the haematology matrix.
# Orientation note: apply(X, 1, f) returns its per-row results as
# COLUMNS, i.e. it transposes. So the first apply(1, center) centres
# each row (each lizard across the 6 variables) and yields a
# 6 x n matrix; the second centres each row of that transposed matrix
# (= each ORIGINAL column, each variable across lizards) and
# transposes back, restoring the variables-in-columns layout.
# Column names survive because apply() carries element names through.
hem <- hem %>%
apply(2, log10_1) %>%   # log10(x + 1) on each column
#apply(2, scale) %>%    # alternative column standardization (disabled)
apply(1, center) %>%    # row-centre; result comes back transposed
apply(1, center) %>%    # column-centre (of original); transposes back
as.data.frame()
# Per-variable distributions after transformation: a boxplot plus a
# density curve in each facet to judge approximate normality by eye.
hem %>%
  pivot_longer(cols = 1:6) %>%
  ggplot(aes(y = value)) +
  geom_boxplot(outliers = FALSE, aes(x = 0), width = .5) +
  geom_density() +
  facet_grid(~name, scales = "free") +
  theme_minimal()
The distributions are not perfectly normal, but they are close.
# Centre LBSI to use as a covariate; presumably a batch/condition
# index -- confirm its meaning against the data description.
batch <- df$LBSI %>% center
## measuring its influence: glm() without a family defaults to
## gaussian, so this is an ordinary linear regression of batch on the
## six haematology variables; plot() then shows the standard lm-style
## diagnostic plots (residuals, QQ, scale-location, leverage).
model_batch <- glm(
batch ~
hem$Tot_WBC + hem$Het_ABS + hem$Lym_ABS +
hem$`H:L Ratio` + hem$Mon_ABS + hem$OthG_ABS
)
plot(model_batch)
IDs 14, 81 and 85 are marked as outliers. Check them later.
Full:
# Transformed values split by treatment and coloured by the centred
# LBSI "batch" covariate, one facet per haematology variable.
# (The original no-op mutate(value = value) step has been removed.)
hem %>%
  cbind(env) %>%
  cbind(batch) %>%
  pivot_longer(cols = 1:6) %>%
  ggplot(mapping = aes("", value)) +
  geom_boxplot(aes(fill = Treatment), outliers = FALSE) +
  geom_jitter(aes(color = batch), size = .2, alpha = .5) +
  theme_minimal() +
  facet_wrap(~name, scales = "free") +
  scale_color_viridis_c()
3 out of 6 categories seem to differ. In contrast, without performing double-centring, 3–4 categories varied.
Is batch equal?
# Does the centred LBSI "batch" covariate differ between treatments?
# Violin plots with quartile lines per treatment group.
ggplot(cbind(env, batch), aes(x = Treatment, y = batch)) +
  geom_violin(draw_quantiles = c(0.25, 0.5, 0.75)) +
  theme_minimal()
OK, the batch effect is nearly equal between groups, but it can be reduced. I choose a MANOVA approach for prediction. First of all, let's remove the outliers.
# PCA on the transformed haematology matrix; the default plot is a
# scree plot of variance per component.
df_pcoa <- prcomp(hem)
plot(df_pcoa)
OK, not bad distribution
gg <- df_pcoa[["x"]] %>%
cbind(env) %>%
mutate (LID = as.character(`Landscape ID`)) %>%
mutate(HBT = as.character(Habitat)) %>%
ggplot(aes(PC1, PC2, color = LID, shape = HBT, text = paste0("ID: ", 1:122))) +
facet_grid(~Treatment) +
geom_point() +
theme_minimal()
ggplotly(gg)
Without the log transform (using only the centring approach), the variances of the two groups become unequal.
gg <- df_pcoa[["x"]] %>%
cbind(env) %>%
cbind(batch) %>%
mutate (LID = as.character(`Landscape ID`)) %>%
mutate(HBT = as.character(Habitat)) %>%
ggplot(aes(PC1, PC2, color = batch, shape = Treatment, text = paste0("ID: ", 1:122))) +
#facet_grid(~Treatment) +
scale_color_viridis_c(name = "LBSI") +
geom_point(size = 1, alpha = .7) +
theme_minimal()
ggplotly(gg)
OK. It seems that lizards from unmodified areas have a bigger spread without row-wise standardization. With this procedure, nothing is related to their grouping. Also, there is ~no grouping or gradient according to LBSI.
# Homogeneity of multivariate dispersion between treatment groups
# (vegan::betadisper on Euclidean distances) -- a PERMANOVA
# assumption check; anova() tests whether group dispersions differ.
# NOTE(review): `dfF` is not defined anywhere in this file as shown.
# Presumably it is hem + env + batch combined (possibly with outliers
# handled); confirm where it is created before rerunning.
df_PCO <-dfF[,1:6] %>%
vegdist(method = "euclidean") %>%
betadisper(dfF$Treatment)
plot(df_PCO)
anova(df_PCO)
## Analysis of Variance Table
##
## Response: Distances
## Df Sum Sq Mean Sq F value Pr(>F)
## Groups 1 0.395 0.39473 0.9336 0.3359
## Residuals 120 50.733 0.42278
Group dispersions are homogeneous now (when scaling with various datawizard package functions, this was not the case).
# PERMANOVA: does treatment (and its interaction with batch) explain
# multivariate haematology variation?
# NOTE(review): the response is `hem` while the grouping comes from
# `dfF` -- if dfF is a filtered/reordered copy, row alignment between
# the two must be verified.
df_adonis <- adonis2(hem ~ dfF$Treatment*batch, method = "euclidean")
df_adonis
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = hem ~ dfF$Treatment * batch, method = "euclidean")
## Df SumOfSqs R2 F Pr(>F)
## dfF$Treatment 1 35.458 0.24279 38.2035 0.001 ***
## batch 1 0.521 0.00357 0.5614 0.583
## dfF$Treatment:batch 1 0.543 0.00372 0.5846 0.545
## Residual 118 109.519 0.74992
## Total 121 146.040 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The difference is significant. But what exactly is the reason for the differences?
# Which variables drive the treatment difference? Logistic regression
# of treatment on the six transformed haematology measures and their
# interactions with the batch covariate.
dfF$target <- as.factor(dfF$Treatment)
df_glm <- glm(data = dfF,
formula = target ~
(Tot_WBC + Het_ABS + Lym_ABS +
`H:L Ratio` + Mon_ABS + OthG_ABS)*batch,
family = "binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## (original knitr output was in Russian; this warning signals
## quasi-complete separation, so the coefficients and p-values below
## should be interpreted with caution)
summary(df_glm)
##
## Call:
## glm(formula = target ~ (Tot_WBC + Het_ABS + Lym_ABS + `H:L Ratio` +
## Mon_ABS + OthG_ABS) * batch, family = "binomial", data = dfF)
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.722 2.019 -3.330 0.000868 ***
## Tot_WBC -29.958 15.847 -1.891 0.058689 .
## Het_ABS -1.510 10.301 -0.147 0.883437
## Lym_ABS -12.282 4.657 -2.637 0.008352 **
## `H:L Ratio` -10.392 3.351 -3.101 0.001928 **
## Mon_ABS -9.322 2.733 -3.412 0.000646 ***
## OthG_ABS NA NA NA NA
## batch -8.561 27.148 -0.315 0.752502
## Tot_WBC:batch 217.299 365.399 0.595 0.552050
## Het_ABS:batch -238.897 242.847 -0.984 0.325247
## Lym_ABS:batch -9.291 121.280 -0.077 0.938936
## `H:L Ratio`:batch 49.846 63.896 0.780 0.435320
## Mon_ABS:batch 65.145 68.589 0.950 0.342220
## OthG_ABS:batch NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 136.100 on 121 degrees of freedom
## Residual deviance: 23.312 on 110 degrees of freedom
## AIC: 47.312
##
## Number of Fisher Scoring iterations: 9
I don't know why the model breaks on the OthG variable (it is dropped as "not defined because of singularities").
# Restrict the data to lizards from modified landscapes, then test
# homogeneity of multivariate dispersion across habitats.
dfH <- subset(dfF, Treatment == "modified")
df_PCO <- betadisper(
  vegdist(dfH[, 1:6], method = "euclidean"),
  dfH$Habitat
)
plot(df_PCO)
anova(df_PCO)
## Analysis of Variance Table
##
## Response: Distances
## Df Sum Sq Mean Sq F value Pr(>F)
## Groups 2 0.315 0.15738 0.3125 0.7324
## Residuals 89 44.828 0.50369
Group dispersions are homogeneous now (when scaling with various datawizard package functions, this was not the case).
# PERMANOVA within modified landscapes: habitat x batch.
df_adonis <- adonis2(dfH[, 1:6] ~ dfH$Habitat * dfH$batch,
                     method = "euclidean")
df_adonis
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = dfH[, 1:6] ~ dfH$Habitat * dfH$batch, method = "euclidean")
## Df SumOfSqs R2 F Pr(>F)
## dfH$Habitat 2 2.650 0.02825 1.2819 0.261
## dfH$batch 1 0.745 0.00794 0.7209 0.479
## dfH$Habitat:dfH$batch 2 1.514 0.01614 0.7322 0.560
## Residual 86 88.900 0.94767
## Total 91 93.810 1.00000
The difference is not significant across different habitats.
# PERMANOVA within modified landscapes: connectivity x batch.
df_adonis <- adonis2(dfH[, 1:6] ~ dfH$Connectivity * dfH$batch,
                     method = "euclidean")
df_adonis
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = dfH[, 1:6] ~ dfH$Connectivity * dfH$batch, method = "euclidean")
## Df SumOfSqs R2 F Pr(>F)
## dfH$Connectivity 1 0.992 0.01058 0.9538 0.420
## dfH$batch 1 0.923 0.00984 0.8869 0.410
## dfH$Connectivity:dfH$batch 1 0.341 0.00363 0.3274 0.728
## Residual 88 91.554 0.97595
## Total 91 93.810 1.00000
The difference is also not significant
# Full PERMANOVA within modified landscapes:
# connectivity x habitat x batch, all interactions.
df_adonis <- adonis2(dfH[, 1:6] ~ dfH$Connectivity * dfH$Habitat * dfH$batch,
                     method = "euclidean")
df_adonis
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = dfH[, 1:6] ~ dfH$Connectivity * dfH$Habitat * dfH$batch, method = "euclidean")
## Df SumOfSqs R2 F Pr(>F)
## dfH$Connectivity 1 0.992 0.01058 0.9402 0.368
## dfH$Habitat 2 2.609 0.02781 1.2358 0.299
## dfH$batch 1 0.861 0.00918 0.8162 0.427
## dfH$Connectivity:dfH$Habitat 2 0.959 0.01022 0.4542 0.789
## dfH$Connectivity:dfH$batch 1 0.241 0.00257 0.2287 0.825
## dfH$Habitat:dfH$batch 2 1.422 0.01516 0.6738 0.634
## dfH$Connectivity:dfH$Habitat:dfH$batch 2 2.290 0.02441 1.0847 0.369
## Residual 80 84.435 0.90007
## Total 91 93.810 1.00000
Also, no visible relations. It might be better if I somehow balanced the variables, but not now.
# Predict the habitat x connectivity class from haematology within
# the modified group.
# NOTE(review): paste(Habitat, Connectivity) can yield more than two
# factor levels, but family = "binomial" models only first level vs
# all others for a factor response -- confirm that a two-level target
# (or a multinomial model) is what is intended here.
dfH$target <- as.factor(paste(dfH$Habitat, dfH$Connectivity))
df_glm <- glm(data = dfH,
formula = target ~
(Tot_WBC + Het_ABS + Lym_ABS +
`H:L Ratio` + Mon_ABS + OthG_ABS),
family = "binomial")
summary(df_glm)
##
## Call:
## glm(formula = target ~ (Tot_WBC + Het_ABS + Lym_ABS + `H:L Ratio` +
## Mon_ABS + OthG_ABS), family = "binomial", data = dfH)
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.468 1.414 3.159 0.00159 **
## Tot_WBC 52.135 26.344 1.979 0.04782 *
## Het_ABS -34.319 16.411 -2.091 0.03650 *
## Lym_ABS -13.039 8.389 -1.554 0.12011
## `H:L Ratio` 6.336 3.463 1.830 0.06729 .
## Mon_ABS -1.675 2.436 -0.687 0.49178
## OthG_ABS NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 44.360 on 91 degrees of freedom
## Residual deviance: 35.191 on 86 degrees of freedom
## AIC: 47.191
##
## Number of Fisher Scoring iterations: 7
# Standard diagnostic plots for the logistic model, used below to
# pick candidate outliers for removal.
plot(df_glm)
Several outliers could be removed to improve both GLM and perMANOVA
# Drop the four observations flagged as outliers in the diagnostic
# plots (matched against row names; %in% coerces both sides to
# character), then refit the logistic model on the reduced data.
dfH <- dfH[!(rownames(dfH) %in% c(70, 91, 96, 119)), ]
df_glm <- glm(
  target ~ (Tot_WBC + Het_ABS + Lym_ABS +
              `H:L Ratio` + Mon_ABS + OthG_ABS),
  data = dfH,
  family = "binomial"
)
## Warning: glm.fit: алгоритм не сошелся
## Warning: glm.fit: возникли подогнанные вероятности 0 или 1
# Diagnostics for the refit. The NaN warnings below likely come from
# leverage values at/near 1 after separation -- confirm before
# trusting the Cook's-distance contours.
plot(df_glm)
## Warning in sqrt(crit * p * (1 - hh)/hh): созданы NaN
## Warning in sqrt(crit * p * (1 - hh)/hh): созданы NaN
# Re-run the full PERMANOVA (connectivity x habitat x batch) after
# removing the flagged outliers.
df_adonis <- adonis2(dfH[, 1:6] ~ dfH$Connectivity * dfH$Habitat * dfH$batch,
                     method = "euclidean")
df_adonis
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = dfH[, 1:6] ~ dfH$Connectivity * dfH$Habitat * dfH$batch, method = "euclidean")
## Df SumOfSqs R2 F Pr(>F)
## dfH$Connectivity 1 0.956 0.01027 0.8700 0.417
## dfH$Habitat 2 2.703 0.02904 1.2297 0.324
## dfH$batch 1 0.935 0.01004 0.8506 0.422
## dfH$Connectivity:dfH$Habitat 2 0.834 0.00896 0.3795 0.837
## dfH$Connectivity:dfH$batch 1 0.206 0.00221 0.1870 0.848
## dfH$Habitat:dfH$batch 2 1.632 0.01754 0.7425 0.577
## dfH$Connectivity:dfH$Habitat:dfH$batch 2 2.270 0.02439 1.0327 0.375
## Residual 76 83.529 0.89754
## Total 87 93.065 1.00000
Still, there is no connection according to the ANOVA, so the GLM might have noisy features for the prediction.